#This file is a long list of per model diagnostics which are reported in the main text and the appendix. 

#Please note you will have to run the commands in HCR - Replication Code - Main models.r to actually generate the models before you can use this to run the diagnostic. 

# Model 1.0 - 2.2 ----

#diagnostics http://www.ssc.wisc.edu/sscc/pubs/MM/MM_DiagInfer.html
#check: normality of DV, plots of residuals, linearity in key IV, normality of residuals, high leverage points
#observe whether any new models deviate from reported ones in terms of the direction and statistical significance of key IV effects
#singular fit problem. can be resolved by taking out some terms, doesn't affect the results. 

# * Model 1.0 ----

#Change model name here and then inspect results 
d.model <- m1.0outov
m.fm <- d.model@call
df <- posts
iv <- posts$cat0
dv <- log(posts$total_resps_outside_ov+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks a bit off
hist(residuals(d.model))
#let's try something without any zero values. it's a bit better. same conclusion
m.lv <- lmer(m.fm, data=df[df$total_resps>0,])
hist(residuals(m.lv))
summary(m.lv)

#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)

# * Model 1.1 ----

#Change model name here and then inspect results 
d.model <- m1.1outov
m.fm <- d.model@call
df <- posts
iv <- posts$cat
dv <- log(posts$total_resps_outside_ov+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks a bit off
hist(residuals(d.model))
#let's try something without any zero values. it's a bit better. same conclusion
m.lv <- lmer(m.fm, data=df[df$total_resps>0,])
hist(residuals(m.lv))
summary(m.lv)
output_lmer(m.lv)#actually the result is signif 

#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)

# * Model 1.2 ----

#Change model name here and then inspect results 
d.model <- m1.2outov
m.fm <- d.model@call
df <- posts_ov
iv <- posts_ov$cat
dv <- log(posts_ov$total_resps_outside_ov+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks fine
hist(residuals(d.model))


#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)


# * Model 2.0 ----

#Change model name here and then inspect results 
d.model <- m2.0outov
m.fm <- d.model@call
df <- posts_resps
iv <- posts_resps$cat
dv <- log(posts_resps$avg_usr_post_vols_before+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data probably at the 'zero' end of the range
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks fine, bit of skew.
hist(residuals(d.model))
qqnorm(residuals(d.model))


#let's try an at least 10 repsonses model
m.lv <- lmer(m.fm, data=df[df$total_resps>10,])
hist(residuals(m.lv))
qqnorm(residuals(m.lv))#better, still not perfect 
summary(m.lv)#results same 

#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)

# * Model 2.1 ----

#Change model name here and then inspect results 
d.model <- m2.1outov
m.fm <- d.model@call
df <- posts_resps
iv <- posts_resps$cat
dv <- log(posts_resps$avg_usr_post_vols_before+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data probably at the 'zero' end of the range
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks fine, bit of skew.
hist(residuals(d.model))
qqnorm(residuals(d.model))


#let's try an at least 10 repsonses model
m.lv <- lmer(m.fm, data=df[df$total_resps>10,])
hist(residuals(m.lv))
qqnorm(residuals(m.lv))#better, still not perfect 
summary(m.lv)

#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)



# * Model 2.2 ----

#Change model name here and then inspect results 
d.model <- m2.2outov
m.fm <- d.model@call
df <- posts_resps_ov
iv <- posts_resps_ov$cat
dv <- log(posts_resps_ov$avg_usr_post_vols_before+1)

#normality of DV
#it's basically OK not perfect
hist(dv)

#residuals vs fitted values 
#ok with some artefact in the data probably at the 'zero' end of the range
plot(d.model)

#linearity in variables
#categorical dv so this is a bit redundant
#i'm leaving it here for the next time i do this
ggplot(data.frame(x1=iv,pearson=residuals(d.model,type="pearson")), aes(x=x1,y=pearson)) + geom_boxplot() +  theme_bw()

#normality of residuals
#looks fine, bit of skew.
hist(residuals(d.model))
qqnorm(residuals(d.model))


#let's try an at least 10 repsonses model
m.lv <- lmer(m.fm, data=df[df$total_resps>10,])
hist(residuals(m.lv))
qqnorm(residuals(m.lv))#better, still not perfect 
summary(m.lv)

#high leverage points. there aren't any. again leaving in place for the next time 
ggplot(data.frame(lev=hatvalues(d.model),pearson=residuals(d.model,type="pearson")), aes(x=lev,y=pearson)) + geom_point() + theme_bw()
#if there are some - does removing them make a difference?
levId <- which(hatvalues(d.model) >= .4)
#check
m.lv <- lmer(m.fm, data=df[-c(levId),])
summary(m.lv)

# Model 3.0 & 3.1 ----

#note the truncated diagnostics are found in the main file

#* Model 3.0 ----
#http://www.sthda.com/english/wiki/cox-model-assumptions

#Prop hazards assumption. Violated. 
zp <- cox.zph(m3.0, transform='km')
zp

#basic plot for diagnostics 
plot(zp[1], resid=F)
abline(0,0, col=2)
abline(h= m3.1$coef[1], col=3, lwd=2, lty=2)

#crossing point 
4800/60


#linearity and correct functional form 
#this is a bit pointless as we've already logged and our key IV is categorical
#leaving it here for future reference
#good explanation here https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
#ggcoxdiagnostics(model.2, linear.predictions = T)
#function is really heavy

#outliers 
dresids <- residuals(m3.0, type="deviance")
lp <- predict(m3.0, type="lp" )
plot(lp, dresids, xlab="Linear Predictor", ylab="Deviance Residual")
#that's not very informative

dfbeta <- residuals(m3.0, type="dfbeta")
colnames(dfbeta) <- names(m3.0$coef)
summary(dfbeta)

#cutoff used 2/sqrt(n) 
#Found in Influence Statistics and Outliers
#http://users.stat.ufl.edu/~winner/sta6127/influence.doc

cutoff <- 2/sqrt(nobs(m3.0))

dfbeta[dfbeta>cutoff]
dfbeta[dfbeta<(cutoff*-1)]

#nothing influential



#* Model 3.1 ----
#http://www.sthda.com/english/wiki/cox-model-assumptions

#Prop hazards assumption. Violated. 
zp <- cox.zph(m3.1, transform='km')
zp


#linearity and correct functional form 
#this is a bit pointless as we've already logged and our key IV is categorical
#leaving it here for future reference
#good explanation here https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
#ggcoxdiagnostics(model.2, linear.predictions = T)
#function is really heavy

#outliers 
dresids <- residuals( m3.1, type="deviance" )
lp <- predict( m3.1, type="lp" )
plot(lp, dresids, xlab="Linear Predictor", ylab="Deviance Residual")

dfbeta <- residuals( m3.1, type="dfbeta" )
colnames(dfbeta) <- names(m3.1$coef)
summary(dfbeta)

#cutoff used 2/sqrt(n) 
#Found in Influence Statistics and Outliers
#http://users.stat.ufl.edu/~winner/sta6127/influence.doc

cutoff <- 2/sqrt(nobs(m3.1))

dfbeta[dfbeta>cutoff]
dfbeta[dfbeta<(cutoff*-1)]

#nothing influential

#* M3.0 Graphic ----
#* #This is the graphic which is shown in Figure A1

#Prop hazards assumption. Violated. 
zp <- cox.zph(m3.0, transform='km')
zp

d <- cbind.data.frame(data.frame(zp[1]$y), data.frame(dimnames(zp[1]$y)), 'x' = zp[1]$x)
colnames(d) <- c('strong', 'time', '.', 'xpos')
d$time <- as.numeric(as.character(d$time))

#basic plot for diagnostics 
plot(zp[1], resid=F)
abline(0,0, col=2)
abline(h= m3.0$coef[1], col=3, lwd=2, lty=2)

#strongly positive for first 80 minutes 
4800/60

#then it fluctuates between 0 and the observed effect

#turns negative after around 40 hours
160000/(60*60)

#This is the ggplot version which was actually used
#let's put human understandable breaks in
min1 <- 60
hour1 <- 60*min1
day1 <- 24*hour1
week1 <- 7*day1

timebreaks <- rbind.data.frame(
  data.frame(xpos = d[which.min(abs(d$time - min1)), ]$xpos,
             text = 't + one minute'),
  data.frame(xpos = d[which.min(abs(d$time - hour1)), ]$xpos,
             text = 't + one hour'),
  data.frame(xpos = d[which.min(abs(d$time - day1)), ]$xpos,
             text = 't + one day'),
  data.frame(xpos = d[which.min(abs(d$time - week1)), ]$xpos,
             text = 't + one week')
)

interceptbreaks <- rbind.data.frame(
  data.frame(xpos = 0.062,
             text = '5.5 minutes'),
  data.frame(xpos = 0.499,
             text = '22.8 hours')
)

#start cross
d[which.min(abs(d$xpos - 0.062)), ]$time / min1

#stop cross
d[which.min(abs(d$xpos - 0.499)), ]$time / hour1

#median gap was 35610 
35610 / (60*60)



ggplot(d, aes(x=xpos, y=strong)) + geom_smooth(color='dodgerblue4', fill='dodgerblue4') +
  geom_vline(data=timebreaks, aes(xintercept = xpos), linetype='dashed', color='indianred') +
  geom_text(data=timebreaks, aes(x = xpos+0.075, y = -3.5, label = text)) +
  geom_hline(yintercept = m3.0$coef[1]) +
  theme_minimal() + xlab('') + ylab('') +
  theme(axis.text.x = element_blank()) +
  xlim(c(0, 1.0))

ggsave('nonprop hazards.png', w=8, h=4)

#for ref this is what the graphic should look like
#plot(zp[1], resid=F)
#abline(0,0, col=2)
#abline(h= model.2$coef[1], col=3, lwd=2, lty=2)



